Sentiment Analysis on Movie Reviews

Using Logistic Regression Model

  • 0 - negative

  • 1 - somewhat negative

  • 2 - neutral

  • 3 - somewhat positive

  • 4 - positive

Load Libraries


In [29]:
import nltk
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

Load & Read Datasets


In [8]:
# Both files are tab-separated; each is read into its own DataFrame.
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

In [10]:
# (rows, columns) of the training frame, then of the test frame.
(train.shape, test.shape)


Out[10]:
((156060, 4), (66292, 3))

In [20]:
# Peek at the first five training rows.
train.head(n=5)


Out[20]:
PhraseId SentenceId Phrase Sentiment
0 1 1 A series of escapades demonstrating the adage ... 1
1 2 1 A series of escapades demonstrating the adage ... 2
2 3 1 A series 2
3 4 1 A 2
4 5 1 series 2

In [13]:
# Peek at the first five test rows.
test.head(n=5)


Out[13]:
PhraseId SentenceId Phrase
0 156061 8545 An intermittently pleasing but mostly routine ...
1 156062 8545 An intermittently pleasing but mostly routine ...
2 156063 8545 An
3 156064 8545 intermittently pleasing but mostly routine effort
4 156065 8545 intermittently pleasing but mostly routine

In [15]:
# Distinct values found in the Sentiment column, in order of first appearance.
train['Sentiment'].unique()


Out[15]:
array([1, 2, 3, 4, 0])

In [21]:
# Summary of the training frame: index range, per-column non-null counts and dtypes, memory use.
train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 156060 entries, 0 to 156059
Data columns (total 4 columns):
PhraseId      156060 non-null int64
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(3), object(1)
memory usage: 6.0+ MB

In [22]:
# Number of training phrases per sentiment label, most frequent first.
train['Sentiment'].value_counts()


Out[22]:
2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [23]:
# Share of training phrases per sentiment label: per-label count divided by
# the number of non-null labels.
train['Sentiment'].value_counts() / train['Sentiment'].count()


Out[23]:
2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64

Train Classifier


In [24]:
# Model input is the raw phrase text; the target is its sentiment label.
X_train, y_train = train['Phrase'], train['Sentiment']

In [30]:
# Three-stage text pipeline, every stage left at its library defaults:
#   'vect'  - CountVectorizer turns each phrase into token counts
#   'tfidf' - TfidfTransformer re-weights those counts
#   'clf'   - LogisticRegression predicts the sentiment label
# Pipeline.fit returns the pipeline itself, so the fitted object is bound directly.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
]).fit(X_train, y_train)

In [31]:
# In-sample check: predict labels for the same phrases the model was fitted on.
# These are the training phrases, so they are referenced as X_train rather than
# being re-bound under a "test" name. Any score computed from `predicted` here
# is training accuracy and will overstate performance on unseen phrases; use a
# held-out split or cross-validation for a generalization estimate.
predicted = text_clf.predict(X_train)

In [32]:
# Accuracy on the training phrases: fraction of predictions equal to the true label.
print(np.mean(y_train == predicted))


0.668787645777

In [34]:
# Summary of the test frame: index range, per-column non-null counts and dtypes, memory use.
test.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 66292 entries, 0 to 66291
Data columns (total 3 columns):
PhraseId      66292 non-null int64
SentenceId    66292 non-null int64
Phrase        66292 non-null object
dtypes: int64(2), object(1)
memory usage: 2.0+ MB

Create Submission


In [35]:
# Predict a sentiment label for every phrase in the test frame and pair each
# prediction with the PhraseId it belongs to.
phraseIds = test['PhraseId']
X_test = test['Phrase']
predicted = text_clf.predict(X_test)
output = pd.DataFrame({"PhraseId": phraseIds, "Sentiment": predicted})
# The export is left disabled; uncomment the next line to write the CSV
# (quoting=3 is the numeric value of csv.QUOTE_NONE).
#output.to_csv( "submission_logistic_regression.csv", index=False, quoting=3 )